{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b0dc02c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "bigexcel_dir =\"./course_datas/c15_excel_split_merge\"\n",
    "smallexcel_dir = f\"{bigexcel_dir}/splits\"\n",
    "\n",
    "\n",
    "import os\n",
    "if not os.path.exists(smallexcel_dir):\n",
    "    os.mkdir(smallexcel_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73a2a8ad",
   "metadata": {},
   "source": [
    "读取大excel到pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "678d016d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_bigexcel = pd.read_excel(f\"{bigexcel_dir}/crazyant_blog_articles_source.xlsx\", engine='openpyxl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0434c4dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2585</td>\n",
       "      <td>Tensorflow怎样接收变长列表特征</td>\n",
       "      <td>python,tensorflow,特征工程</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2583</td>\n",
       "      <td>Pandas实现数据的合并concat</td>\n",
       "      <td>pandas,python,数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2574</td>\n",
       "      <td>Pandas的Index索引有什么用途？</td>\n",
       "      <td>pandas,python,数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2564</td>\n",
       "      <td>机器学习常用数据集大全</td>\n",
       "      <td>python,机器学习</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2561</td>\n",
       "      <td>一个数据科学家的修炼路径</td>\n",
       "      <td>数据分析</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     id                 title                    tags\n",
       "0  2585  Tensorflow怎样接收变长列表特征  python,tensorflow,特征工程\n",
       "1  2583   Pandas实现数据的合并concat      pandas,python,数据分析\n",
       "2  2574  Pandas的Index索引有什么用途？      pandas,python,数据分析\n",
       "3  2564           机器学习常用数据集大全             python,机器学习\n",
       "4  2561          一个数据科学家的修炼路径                    数据分析"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_bigexcel.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f350ae50",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(258, 3)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_bigexcel.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "05391aa7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "258"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_bigexcel_rowcounts = df_bigexcel.shape[0]\n",
    "df_bigexcel_rowcounts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "64ff8105",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=258, step=1)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_bigexcel.index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9151b60",
   "metadata": {},
   "source": [
    "拆分大excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e9f9b231",
   "metadata": {},
   "outputs": [],
   "source": [
    "usernames = [\"xiaojun\",\"xiaohua\",\"xiaoyuan\",\"xiaoning\",\"xiaohao\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "cf611962",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "52"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fenpeirows = df_bigexcel_rowcounts//len(usernames)\n",
    "if df_bigexcel_rowcounts%len(usernames) != 0:\n",
    "    fenpeirows += 1\n",
    "    \n",
    "fenpeirows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7cccd0c7",
   "metadata": {},
   "outputs": [
    {
     "ename": "IndexError",
     "evalue": "single positional indexer is out-of-bounds",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_24408\\3812786345.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      4\u001b[0m     \u001b[0mbegin_row\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0midx\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mfenpeirows\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[0mend_row\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbegin_row\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mfenpeirows\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m     \u001b[0mdf_smallexcel_rows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_bigexcel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mbegin_row\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mend_row\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      7\u001b[0m     \u001b[0mdf_smallexcel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0midx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0musername\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_smallexcel_rows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m    871\u001b[0m                     \u001b[1;31m# AttributeError for IntervalTree get_value\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    872\u001b[0m                     \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 873\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    874\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    875\u001b[0m             \u001b[1;31m# we by definition only have the 0th axis\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_getitem_tuple\u001b[1;34m(self, tup)\u001b[0m\n\u001b[0;32m   1441\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_getitem_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtup\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1442\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1443\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_has_valid_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtup\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1444\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1445\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_lowerdim\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtup\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_has_valid_tuple\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m    700\u001b[0m                 \u001b[1;32mraise\u001b[0m \u001b[0mIndexingError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Too many indexers\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    701\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 702\u001b[1;33m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_key\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    703\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    704\u001b[0m                 raise ValueError(\n",
      "\u001b[1;32md:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_validate_key\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m   1350\u001b[0m             \u001b[1;32mreturn\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1351\u001b[0m         \u001b[1;32melif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1352\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1353\u001b[0m         \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1354\u001b[0m             \u001b[1;31m# a tuple should already have been caught by this point\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_validate_integer\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m   1435\u001b[0m         \u001b[0mlen_axis\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1436\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;33m-\u001b[0m\u001b[0mlen_axis\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1437\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"single positional indexer is out-of-bounds\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1438\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1439\u001b[0m     \u001b[1;31m# -------------------------------------------------------------------\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mIndexError\u001b[0m: single positional indexer is out-of-bounds"
     ]
    }
   ],
   "source": [
    "df_smallexcel = []\n",
    "\n",
    "for idx, username in enumerate(usernames):\n",
    "    begin_row = idx*fenpeirows\n",
    "    end_row = begin_row+fenpeirows\n",
    "    df_smallexcel_rows = df_bigexcel.iloc[begin_row,end_row]\n",
    "    df_smallexcel.append((idx, username, df_smallexcel_rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "20ae6c4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx,username,df_smallexcel_rows in df_smallexcel:\n",
    "    file_name = f\"{smallexcel_dir}/slipsexcel_{idx}_{username}.xlsx\"\n",
    "    df_smallexcel_rows.to_excel(file_name, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c84328f0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95ee5276",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a450a71d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "618bac95",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b87ca0e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f4a4ee4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "018ad081",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e80641ba",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eebc4366",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
